#!/usr/bin/env python
from Claim import Claim
from create_data_sets import DatasetAcess

# Export the training + validation claims as an Orange tab-delimited
# dataset: three header rows (attribute names, types, flags) followed by
# one row per claim (category label, then one word-count per vocab entry).
# NOTE(review): this script targets Python 2 — `get_category()` appears to
# return a unicode object whose utf-8 encoding is treated as `str`; confirm
# before porting to Python 3.
data = DatasetAcess()
#data.create_new_data_sets()
training = data.get_training_set()
training.extend(data.get_validation_set())
n = len(Claim.vocabulary)

with open('dataset_for_orange.tab', 'w') as output:
    # Row 1: attribute names — the class column plus one column per vocab word.
    output.write('category' + ''.join('\tw' + str(i) for i in range(n)) + '\n')
    # Row 2: attribute types — 'd' (discrete) for the category, 'c'
    # (continuous) for each word count. The original wrote 'd\t' + 'c\t'*n,
    # which left a trailing tab (an extra empty field not present in the
    # name row); emit exactly n+1 fields to match row 1.
    output.write('d' + '\tc' * n + '\n')
    # Row 3: flags — mark the first column as the class attribute.
    output.write('class\n')

    # Data rows: join fields once per row instead of building the line with
    # repeated '+=' (quadratic for wide vocabularies).
    for claim in training:
        fields = [claim.get_category().encode('utf-8').replace(' ', '_')]
        fields.extend(str(count) for count in claim.vectorized_form())
        output.write('\t'.join(fields) + '\n')
